As usual, we start with some IPython magic to enable R support in the notebook.


In [1]:
%load_ext rmagic


During startup - Warning messages:
1: Setting LC_TIME failed, using "C" 
2: Setting LC_MONETARY failed, using "C" 
3: Setting LC_PAPER failed, using "C" 
4: Setting LC_MEASUREMENT failed, using "C" 

In [2]:
%%R

dir = "19-05-2014"

options(stringsAsFactors=F)

mycon = gzcon(gzfile(paste(dir, "buggy_traces.csv.gz", sep="/"), open="r"))
buggy_program_events = read.csv(textConnection(readLines(mycon)), sep="\t", header = F)


mycon = gzcon(gzfile(paste(dir, "robust_traces.csv.gz", sep="/"), open="r"))
robust_program_events = read.csv(textConnection(readLines(mycon)), sep="\t", header = F)

print(nrow(robust_program_events))
print(nrow(buggy_program_events))

# With stringsAsFactors=F the columns are plain character vectors, so we use
# unique() rather than levels() to collect the program names.
programs = unique(c(buggy_program_events[,1], robust_program_events[,1]))
# Class labels: "R" = robust, "B" = buggy
cats = factor(c(robust_program_events[,4], buggy_program_events[,4]), levels = c("R","B"))

#write.csv(programs,paste(dir,"programs.csv", sep="/"))


[1] 182
[1] 546
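The trace files have no header; the column labels in the following sketch are our own guesses, inferred from how the columns are used later in the notebook (later code indexes the columns by position, so these names are purely for readability):

# Hypothetical labels: program name, mutation tokens, event tokens,
# and the class ("R" = robust, "B" = buggy)
names(buggy_program_events)[1:4]  = c("program", "mutations", "events", "class")
names(robust_program_events)[1:4] = c("program", "mutations", "events", "class")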

Each program trace is treated as a text "document": the mutation information (column 2) and the sequence of observed events (column 3) of every program are strings of space-separated tokens, so we can apply standard text-mining machinery to them, build document-term matrices, and use the resulting term-frequency vectors as features for classification.
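For intuition, here is a toy sketch (with made-up tokens, not taken from the real data) of how a couple of traces become a document-term matrix:

library(tm)

toy_traces = c("strlen.0.hptr32 strlen.ret_val.num32b8",
               "strcpy.0.hptr32 strcpy.1.hptr32 strcpy.ret_val.hptr32")
toy_dtm = DocumentTermMatrix(Corpus(VectorSource(toy_traces)))  # 2 documents, 5 terms
inspect(toy_dtm)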

Now, we load the tm package and create the corpora from these "documents".


In [3]:
%%R

library(tm)

mut_corpus = Corpus(VectorSource(c(robust_program_events[,2],buggy_program_events[,2])))
evs_corpus = Corpus(VectorSource(c(robust_program_events[,3],buggy_program_events[,3])))

print(mut_corpus)
print(evs_corpus)


A corpus with 728 text documents
A corpus with 728 text documents

Now it is time to create the document-term matrices and convert them to data frames, adding their corresponding classes. The inspect() function, which we use to convert each document-term matrix into a data frame, prints information we don't need, so we discard that output with sink() in this step.


In [4]:
%%R

#library("RWeka")
#options(mc.cores=1)


mut_dm = DocumentTermMatrix(mut_corpus)

sink("/dev/null")

mut_dm_df =  as.data.frame(inspect(mut_dm))
rownames(mut_dm_df) = 1:nrow(mut_dm)
mut_dm_df["class"] = cats

sink()

#BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2, delimiters=" "))
evs_dm = DocumentTermMatrix(evs_corpus, control = list(bounds = list(global = c(1,Inf))))#, tokenize = BigramTokenizer))
print(evs_dm)

sink("/dev/null")

# Keep only the 100 most frequent event terms as features
v = sort(colSums(as.matrix(evs_dm)), decreasing=TRUE)
cols = names(head(v, 100))

evs_dm_df =  (as.data.frame(inspect(evs_dm)))
#rownames(evs_dm_df) = 1:nrow(evs_dm)
evs_dm_df = evs_dm_df[,cols]
#print(2)
evs_dm_df["class"] = cats

sink()


A document-term matrix (728 documents, 782 terms)

Non-/sparse entries: 27754/541542
Sparsity           : 95%
Maximal term length: 36 
Weighting          : term frequency (tf)
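As a side note, a quieter alternative is to convert the document-term matrix with as.matrix(), which prints nothing, so the sink() redirection becomes unnecessary; a minimal sketch:

# Same data frame as mut_dm_df (minus the class column), built without inspect()
mut_dm_df_alt = as.data.frame(as.matrix(mut_dm))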

To finish preparing the data, we need to make sure we are using the same variables for all the corpora, combining the mutation and event features of each case.


In [5]:
%%R

# Mutation data

mut_robust_cases = mut_dm_df[mut_dm_df$class == "R",]
mut_buggy_cases  = mut_dm_df[mut_dm_df$class == "B",]

# Event data

evs_robust_cases = evs_dm_df[evs_dm_df$class == "R",]
evs_buggy_cases  = evs_dm_df[evs_dm_df$class == "B",]

both_robust_cases = cbind(mut_robust_cases[,names(mut_robust_cases) != "class"], evs_robust_cases)
both_buggy_cases = cbind(mut_buggy_cases[,names(mut_buggy_cases) != "class"], evs_buggy_cases)

print(nrow(both_robust_cases))
print(nrow(both_buggy_cases))

print(ncol(both_robust_cases))
print(ncol(both_buggy_cases))


[1] 397
[1] 331
[1] 680
[1] 680
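One caveat: cbind() does not rename clashing columns, so if a term occurs in both the mutation and the event vocabulary the combined frames will carry duplicated column names, which can confuse formula-based modelling later. A quick optional check:

# Count duplicated column names in the combined robust cases (0 means no clashes)
print(sum(duplicated(names(both_robust_cases))))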

In [6]:
%%R

robust_cases = both_robust_cases
buggy_cases = both_buggy_cases

#rm(buggy_program_events)
#gc()

Now, we are ready to select the training and test sets.


In [7]:
%%R

train_size = 250
test_size = nrow(buggy_cases) - train_size

print(train_size)
print(test_size)

n = nrow(buggy_cases)
rsample = sample(n)

train_sample = rsample[1:(train_size)] 
test_sample = rsample[(train_size+1):(train_size+test_size)]

#print(rsample)

buggy_train = buggy_cases[train_sample,]
buggy_test  = buggy_cases[test_sample,]

print(nrow(buggy_train))
print(nrow(buggy_test))

# robust train and test

n = nrow(robust_cases)
rsample = sample(n)

#print(rsample)

# train_size cases are selected to keep the training dataset balanced
train_sample = rsample[1:(train_size)]
test_sample =  rsample[(train_size+1):(train_size+test_size)]
more_test_sample = rsample[(train_size+test_size+1):n]

robust_train = robust_cases[train_sample,]
robust_test  = robust_cases[test_sample,]
robust_more_test = robust_cases[more_test_sample,]

print(nrow(robust_train))
print(nrow(robust_test))

train = rbind(buggy_train, robust_train)
test  = rbind(buggy_test, robust_test)
more_test = robust_more_test

#print(more_test[1,])


[1] 250
[1] 81
[1] 250
[1] 81
[1] 250
[1] 81
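Note that sample() is not seeded above, so the exact split, and therefore the figures reported from here on, will vary from run to run. For a reproducible split one could seed the RNG before the calls to sample(); a minimal sketch (the seed value is arbitrary):

# Run before the sampling above to make the train/test split reproducible
set.seed(2014)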

Finally, we are ready to train and test a classifier: a k-NN model or an SVM. A k-NN baseline is sketched right below; the cell that follows trains an SVM and evaluates it on both test sets with caret's confusionMatrix.
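A minimal k-NN baseline could look like the following sketch (the class package and k = 5 are our own choices, not tuned):

# k-NN baseline (sketch): k = 5 is arbitrary and untuned
library(class)
library(caret)

x_train = train[, names(train) != "class"]
y_train = factor(train[, "class"])
x_test  = test[, names(test) != "class"]
y_test  = test[, "class"]

z_knn = knn(train = x_train, test = x_test, cl = y_train, k = 5)
print(confusionMatrix(table(pred = z_knn, true = y_test)))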


In [11]:
%%R
library("e1071")
library("caret")

xy_train = train
xy_train[,"class"] = factor(train[,"class"])
x_test = test[,names(test) != "class"]
y_test  = test[,"class"]

x_more_test = more_test[,names(test) != "class"]
y_more_test  = more_test[,"class"]

#m = svm(class ~., data=xy_train, gamma=0.1, cost=10)#, kernel="linear")
m = svm(class ~., data=xy_train, gamma=0.001, cost=100)#, kernel="linear")
#scores = t(abs(t(m$coefs) %*% m$SV))
#inds = sort(scores, decreasing=TRUE, index.return = TRUE)$i
#print(scores[inds,])

#m = tune.svm(class~., data = xy_train,  gamma = 10^(-5:-1), cost = 10^(1:2))
#print(summary(m))
#m = m$best.model

#m = svm(class ~., data=xy_train, gamma=0.01, cost=100)#, kernel="linear")

z = predict(m,x_test)
#print(z)
#print(y_test)
print(confusionMatrix(table(pred=z, true=y_test)))

z = predict(m,x_more_test)
#print(z)
print(confusionMatrix(table(pred=z, true=y_more_test)))


Confusion Matrix and Statistics

    true
pred  R  B
   R 66 13
   B 15 68
                                       
               Accuracy : 0.8272       
                 95% CI : (0.76, 0.882)
    No Information Rate : 0.5          
    P-Value [Acc > NIR] : <2e-16       
                                       
                  Kappa : 0.6543       
 Mcnemar's Test P-Value : 0.8501       
                                       
            Sensitivity : 0.8148       
            Specificity : 0.8395       
         Pos Pred Value : 0.8354       
         Neg Pred Value : 0.8193       
             Prevalence : 0.5000       
         Detection Rate : 0.4074       
   Detection Prevalence : 0.4877       
      Balanced Accuracy : 0.8272       
                                       
       'Positive' Class : R            
                                       
Confusion Matrix and Statistics

    true
pred  R  B
   R 51  0
   B 15  0
                                         
               Accuracy : 0.7727         
                 95% CI : (0.653, 0.8669)
    No Information Rate : 1              
    P-Value [Acc > NIR] : 1.0000000      
                                         
                  Kappa : 0              
 Mcnemar's Test P-Value : 0.0003006      
                                         
            Sensitivity : 0.7727         
            Specificity :     NA         
         Pos Pred Value :     NA         
         Neg Pred Value :     NA         
             Prevalence : 1.0000         
         Detection Rate : 0.7727         
   Detection Prevalence : 0.7727         
      Balanced Accuracy :     NA         
                                         
       'Positive' Class : R              
                                         

In [12]:
%%R

# Heuristic feature ranking: the magnitude of t(coefs) %*% SV approximates the
# weight each term gets in the decision function (exact only for a linear kernel).
scores = t(abs(t(m$coefs) %*% m$SV))
inds = sort(scores, decreasing=TRUE, index.return = TRUE)$i
print(names(scores[inds,][1:50]))


 [1] "X.strlen.ret_val.num32b8."          "X.strlen.0.hptr32."                
 [3] "X.strchr.1.num32b8."                "X.strchr.0.lptr32."                
 [5] "X.strcpy.ret_val.hptr32."           "X.strcpy.0.hptr32."                
 [7] "X.strlen.0.gptr32."                 "X.memcpy.1.hptr32."                
 [9] "X.strcpy.1.hptr32."                 "X.strchr.ret_val.nptr32."          
[11] "X.__ctype_b_loc.ret_val.fptr32."    "X.__ctype_b_loc.0.top32."          
[13] "X.memcpy.0.hptr32."                 "X.memcpy.ret_val.hptr32."          
[15] "X.memcpy.2.num32b8."                "X._io_getc.ret_val.num32b8."       
[17] "X.strchr.ret_val.lptr32."           "X.fread.1.num32b8."                
[19] "X.fread.3.hptr32."                  "X.fread.2.num32b8."                
[21] "X.fread.0.sptr32."                  "X.fread.ret_val.num32b0."          
[23] "X.strlen.0.lptr32."                 "X.tolower.0.num32b8."              
[25] "X.tolower.ret_val.num32b8."         "X._io_getc.0.hptr32."              
[27] "X.memcpy.1.lptr32."                 "X.realloc.ret_val.hptr32."         
[29] "X.realloc.1.num32b8."               "X.strcmp.0.hptr32."                
[31] "X.memset.0.hptr32."                 "X.memset.ret_val.hptr32."          
[33] "X.realloc.0.nptr32."                "X.memset.2.num32b16."              
[35] "X.memset.1.num32b0."                "X.__errno_location.0.top32."       
[37] "X.__errno_location.ret_val.fptr32." "X.fgets.0.sptr32."                 
[39] "X.fgets.ret_val.sptr32."            "X.fgets.2.hptr32."                 
[41] "X.fgets.1.num32b16."                "X.strcmp.1.sptr32."                
[43] "X.vfprintf.0.lptr32."               "X.vfprintf.2.sptr32."              
[45] "X.vfprintf.ret_val.num32b8."        "X.fprintf.1.gptr32."               
[47] "X.vfprintf.1.gptr32."               "X.fprintf.0.lptr32."               
[49] "X.fputc.0.num32b8."                 "X.fputc.1.lptr32."                 

In [13]:
%%R
m_vars = names(xy_train)

save(m_vars, file=paste(dir, "svms", "mvars.data", sep="/"))
save(m, file=paste(dir, "svms", "mutation-event-classifier.svm", sep="/"))
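These files can later be restored in another R session with load(). A sketch of how the saved classifier might be reused (new_cases is a hypothetical data frame with the same feature columns as the training data):

load(paste(dir, "svms", "mvars.data", sep="/"))                     # restores m_vars
load(paste(dir, "svms", "mutation-event-classifier.svm", sep="/"))  # restores m
#predictions = predict(m, new_cases[, setdiff(m_vars, "class")])    # new_cases is hypothetical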
